# -------------------------------------------------------------------
# Purpose: Creates Figure B2
# Author:  Max Posch, 25/07/2025
# Usage:   Source this script to generate both panels of the figure
# -------------------------------------------------------------------
# Check that required paths exist
stopifnot(dir.exists(pdataraw))
stopifnot(dir.exists(pdataanalysis))
stopifnot(dir.exists(poutputappendix))

p_unload(tidytable)


## Load data
load(file.path(pdataanalysis, "countyLevel18501940.RData"))
load(file.path(pdataraw, "US_tl2000_cnty_shp_1790_2000.Rdata"))
load(file.path(pdataraw, "US_tl2000_state_shp_1790_2000.Rdata"))

# Top plot

## to address warning about old-style crs
st_crs(shp2000[["1900"]]) <- st_crs(shp2000[["1900"]])
st_crs(state_shp2000[["1900"]]) <- st_crs(state_shp2000[["1900"]])


## Subset data
d <- na.omit(countyLevel18501940[year == 1940,.(gisjoin_1900, entropy_namelast_mp_adjp)])
cnty1900 <- shp2000[["1900"]] %>% 
    filter(ICPSRSTI != 82, ICPSRSTI != 81, ICPSRSTI != 0) %>% 
    select(gisjoin_1900 = GISJOIN, geometry) %>% 
    rmapshaper::ms_simplify(keep = 0.05, keep_shapes = TRUE)
state1900 <- state_shp2000[["1900"]] %>% 
    filter(ICPSRST != 82, ICPSRST != 81) %>% 
    select(geometry) %>% 
    rmapshaper::ms_simplify(keep = 0.05, keep_shapes = TRUE)
dfplot <- left_join(cnty1900, d)


## Create plot
plotname <- file.path(poutputappendix, "figureB02top.pdf")
p <- ggplot() +
    geom_sf(data = dfplot, aes(fill = cut_number(entropy_namelast_mp_adjp, 7)), lwd = 0, colour = NA) +
    coord_sf(crs = st_crs(dfplot), datum = NA) +
    scale_fill_brewer(
        palette = "GnBu", na.value = "grey",
        guide = guide_legend("Surname diversity")
    ) +
    geom_sf(data = state1900, fill = NA, colour = "black", size = 0.1) +
    coord_sf(crs = st_crs(state1900), datum = NA) +
    labs(caption = "1940 surname diversity (entropy) across counties harmonized to 1900 borders") +
    ggthemes::theme_map() +
    theme(legend.position = "right")
ggsave(plotname, p, width = 8, height = 4.6)

cat("Figure B2 left saved to:", plotname, "\n")


# Bottom plot

## Subset data
d <- na.omit(countyLevel18501940[year == 1940, .(gisjoin_1900, entropy_namelast_mp_adjp, n_sum_namelast_mp_adjp)])
d[, entropy_namelast_mp_adjp_r := as.numeric(resid(lm(entropy_namelast_mp_adjp ~ log(n_sum_namelast_mp_adjp), d))) + mean(entropy_namelast_mp_adjp)]

cnty1900 <- shp2000[["1900"]] %>%
    filter(ICPSRSTI != 82, ICPSRSTI != 81, ICPSRSTI != 0) %>%
    select(gisjoin_1900 = GISJOIN, geometry) %>%
    rmapshaper::ms_simplify(keep = 0.05, keep_shapes = TRUE)
state1900 <- state_shp2000[["1900"]] %>%
    filter(ICPSRST != 82, ICPSRST != 81) %>%
    select(geometry) %>%
    rmapshaper::ms_simplify(keep = 0.05, keep_shapes = TRUE)
dfplot <- left_join(cnty1900, d)


## Create plot
plotname <- file.path(poutputappendix, "figureB02bottom.pdf")
p <- ggplot() +
    geom_sf(data = dfplot, aes(fill = cut_number(entropy_namelast_mp_adjp_r, 7)), lwd = 0, colour = NA) +
    coord_sf(crs = st_crs(dfplot), datum = NA) +
    scale_fill_brewer(
        palette = "GnBu", na.value = "grey",
        guide = guide_legend("Surname diversity")
    ) +
    geom_sf(
        data = state1900,
        fill = NA, colour = "black", size = 0.1
    ) +
    coord_sf(crs = st_crs(state1900), datum = NA) +
    labs(caption = "1940 surname diversity (entropy) residualized by log population across counties harmonized to 1900 borders") +
    ggthemes::theme_map() +
    theme(legend.position = "right")
ggsave(plotname, p, width = 8, height = 4.6)

p_load(tidytable)
cat("Figure B2 right saved to:", plotname, "\n")
